import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('HR-Employee-Attrition.csv')
df.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
df.shape
(1470, 35)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null object 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(26), object(9) memory usage: 402.1+ KB
df.describe()
| Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | ... | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | ... | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | ... | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | ... | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | ... | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | ... | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | ... | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
8 rows × 26 columns
df.Attrition.value_counts()
No 1233 Yes 237 Name: Attrition, dtype: int64
df.Department.value_counts()
Research & Development 961 Sales 446 Human Resources 63 Name: Department, dtype: int64
df.isnull().any()
Age False Attrition False BusinessTravel False DailyRate False Department False DistanceFromHome False Education False EducationField False EmployeeCount False EmployeeNumber False EnvironmentSatisfaction False Gender False HourlyRate False JobInvolvement False JobLevel False JobRole False JobSatisfaction False MaritalStatus False MonthlyIncome False MonthlyRate False NumCompaniesWorked False Over18 False OverTime False PercentSalaryHike False PerformanceRating False RelationshipSatisfaction False StandardHours False StockOptionLevel False TotalWorkingYears False TrainingTimesLastYear False WorkLifeBalance False YearsAtCompany False YearsInCurrentRole False YearsSinceLastPromotion False YearsWithCurrManager False dtype: bool
df.isnull().sum()
Age 0 Attrition 0 BusinessTravel 0 DailyRate 0 Department 0 DistanceFromHome 0 Education 0 EducationField 0 EmployeeCount 0 EmployeeNumber 0 EnvironmentSatisfaction 0 Gender 0 HourlyRate 0 JobInvolvement 0 JobLevel 0 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 MonthlyRate 0 NumCompaniesWorked 0 Over18 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 RelationshipSatisfaction 0 StandardHours 0 StockOptionLevel 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
df.drop(['EmployeeCount','EmployeeNumber','Over18','StandardHours'],axis=1,inplace=True)
df.shape
(1470, 31)
sns.countplot(x='Attrition',data=df)
<Axes: xlabel='Attrition', ylabel='count'>
sns.distplot(df['Age'])
C:\Users\Arun\AppData\Local\Temp\ipykernel_17120\3255828239.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df['Age'])
<Axes: xlabel='Age', ylabel='Density'>
plt.figure(figsize=(15,5))
sns.countplot(x='Age',hue='Attrition',data=df)
<Axes: xlabel='Age', ylabel='count'>
sns.barplot(data=df,x='Attrition',y='Age',hue='Gender')
<Axes: xlabel='Attrition', ylabel='Age'>
plt.pie(df.BusinessTravel.value_counts(),labels=df.BusinessTravel.unique(),autopct='%.2f%%')
plt.title("Business Travel")
Text(0.5, 1.0, 'Business Travel')
plt.pie(df.Department.value_counts(),labels=['R&D','Sales','HR'],autopct='%1.1f%%')
plt.title('Department')
Text(0.5, 1.0, 'Department')
plt.figure(figsize=(7,7))
plt.pie(df.EducationField.value_counts(),labels=['Life Sciences','Medical','Marketing','Technical Degree','Other','Human Resources'],autopct='%.2f%%')
plt.title("Business Travel")
Text(0.5, 1.0, 'Business Travel')
sns.scatterplot(data=df,x='Age',y='TotalWorkingYears')
<Axes: xlabel='Age', ylabel='TotalWorkingYears'>
plt.figure(figsize=(20,20))
sns.heatmap(df.corr(),annot=True)
C:\Users\Arun\AppData\Local\Temp\ipykernel_17120\271047654.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df.corr(),annot=True)
<Axes: >
plt.figure(figsize=(30,30))
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x25220cf2910>
<Figure size 3000x3000 with 0 Axes>
plt.figure(figsize=(15,7))
plt.subplot(3,3,1)
sns.boxplot(df.MonthlyIncome)
plt.ylabel("Monthly Income")
plt.subplot(3,3,2)
sns.boxplot(df.TotalWorkingYears)
plt.ylabel("Total Working Years")
plt.subplot(3,3,3)
sns.boxplot(df.TrainingTimesLastYear)
plt.ylabel("Training Times Last Year")
plt.subplot(3,3,4)
sns.boxplot(df.YearsAtCompany)
plt.ylabel("Years At Company")
plt.subplot(3,3,5)
sns.boxplot(df.YearsInCurrentRole)
plt.ylabel("Years In Current Role")
plt.subplot(3,3,6)
sns.boxplot(df.YearsSinceLastPromotion)
plt.ylabel("Years Since Last Promotion")
plt.subplot(3,3,7)
sns.boxplot(df.YearsWithCurrManager)
plt.ylabel("Years With Curr Manager")
plt.show()
q1 = df.MonthlyIncome.quantile(0.25)
q3 = df.MonthlyIncome.quantile(0.75)
IQR = q3-q1
IQR
5468.0
upper_limit = q3+1.5*IQR
df['MonthlyIncome'] = np.where(df['MonthlyIncome']>upper_limit,upper_limit,df['MonthlyIncome'])
sns.boxplot(df.MonthlyIncome)
<Axes: >
q1 = df.TotalWorkingYears.quantile(0.25)
q3 = df.TotalWorkingYears.quantile(0.75)
IQR = q3-q1
IQR
9.0
upper_limit = q3+1.5*IQR
df['TotalWorkingYears'] = np.where(df['TotalWorkingYears']>upper_limit,upper_limit,df['TotalWorkingYears'])
sns.boxplot(df['TotalWorkingYears'])
<Axes: >
q1 = df.TrainingTimesLastYear.quantile(0.25)
q3 = df.TrainingTimesLastYear.quantile(0.75)
IQR = q3-q1
IQR
1.0
upper_limit = q3+1.5*IQR
lower_limit = q1-1.5*IQR
df['TrainingTimesLastYear'] = np.where(df['TrainingTimesLastYear']>upper_limit,upper_limit,np.where(df['TrainingTimesLastYear']<lower_limit,lower_limit,df['TrainingTimesLastYear']))
sns.boxplot(df.TrainingTimesLastYear)
<Axes: >
q1 = df.YearsAtCompany.quantile(0.25)
q3 = df.YearsAtCompany.quantile(0.75)
IQR = q3-q1
IQR
6.0
upper_limit = q3+1.5*IQR
df['YearsAtCompany'] = np.where(df['YearsAtCompany']>upper_limit,upper_limit,df['YearsAtCompany'])
sns.boxplot(df.YearsAtCompany)
<Axes: >
q1 = df.YearsInCurrentRole.quantile(0.25)
q3 = df.YearsInCurrentRole.quantile(0.75)
IQR = q3-q1
IQR
5.0
upper_limit = q3+1.5*IQR
df['YearsInCurrentRole'] = np.where(df['YearsInCurrentRole']>upper_limit,upper_limit,df['YearsInCurrentRole'])
sns.boxplot(df.YearsInCurrentRole)
<Axes: >
q1 = df.YearsSinceLastPromotion.quantile(0.25)
q3 = df.YearsSinceLastPromotion.quantile(0.75)
IQR = q3-q1
IQR
3.0
upper_limit = q3+1.5*IQR
df['YearsSinceLastPromotion'] = np.where(df['YearsSinceLastPromotion']>upper_limit,upper_limit,df['YearsSinceLastPromotion'])
sns.boxplot(df.YearsSinceLastPromotion)
<Axes: >
q1 = df.YearsWithCurrManager.quantile(0.25)
q3 = df.YearsWithCurrManager.quantile(0.75)
IQR = q3-q1
IQR
5.0
upper_limit = q3+1.5*IQR
df['YearsWithCurrManager'] = np.where(df['YearsWithCurrManager']>upper_limit,upper_limit,df['YearsWithCurrManager'])
sns.boxplot(df.YearsWithCurrManager)
<Axes: >
X = df.drop(['Attrition'],axis=1)
X.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | HourlyRate | ... | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 2 | Female | 94 | ... | 3 | 1 | 0 | 8.0 | 0.5 | 1 | 6.0 | 4.0 | 0.0 | 5.0 |
| 1 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 3 | Male | 61 | ... | 4 | 4 | 1 | 10.0 | 3.0 | 3 | 10.0 | 7.0 | 1.0 | 7.0 |
| 2 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 4 | Male | 92 | ... | 3 | 2 | 0 | 7.0 | 3.0 | 3 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 4 | Female | 56 | ... | 3 | 3 | 0 | 8.0 | 3.0 | 3 | 8.0 | 7.0 | 3.0 | 0.0 |
| 4 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | Male | 40 | ... | 3 | 4 | 1 | 6.0 | 3.0 | 3 | 2.0 | 2.0 | 2.0 | 2.0 |
5 rows × 30 columns
Y = df['Attrition']
Y.head()
0 Yes 1 No 2 Yes 3 No 4 No Name: Attrition, dtype: object
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
columns = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
X[columns] = X[columns].apply(le.fit_transform)
X.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | HourlyRate | ... | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 2 | 1102 | 2 | 1 | 2 | 1 | 2 | 0 | 94 | ... | 3 | 1 | 0 | 8.0 | 0.5 | 1 | 6.0 | 4.0 | 0.0 | 5.0 |
| 1 | 49 | 1 | 279 | 1 | 8 | 1 | 1 | 3 | 1 | 61 | ... | 4 | 4 | 1 | 10.0 | 3.0 | 3 | 10.0 | 7.0 | 1.0 | 7.0 |
| 2 | 37 | 2 | 1373 | 1 | 2 | 2 | 4 | 4 | 1 | 92 | ... | 3 | 2 | 0 | 7.0 | 3.0 | 3 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 33 | 1 | 1392 | 1 | 3 | 4 | 1 | 4 | 0 | 56 | ... | 3 | 3 | 0 | 8.0 | 3.0 | 3 | 8.0 | 7.0 | 3.0 | 0.0 |
| 4 | 27 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 1 | 40 | ... | 3 | 4 | 1 | 6.0 | 3.0 | 3 | 2.0 | 2.0 | 2.0 | 2.0 |
5 rows × 30 columns
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
X_Scaled = pd.DataFrame(ms.fit_transform(X),columns=X.columns)
X_Scaled.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | HourlyRate | ... | PerformanceRating | RelationshipSatisfaction | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.547619 | 1.0 | 0.715820 | 1.0 | 0.000000 | 0.25 | 0.2 | 0.333333 | 0.0 | 0.914286 | ... | 0.0 | 0.000000 | 0.000000 | 0.280702 | 0.000 | 0.000000 | 0.333333 | 0.275862 | 0.000000 | 0.344828 |
| 1 | 0.738095 | 0.5 | 0.126700 | 0.5 | 0.250000 | 0.00 | 0.2 | 0.666667 | 1.0 | 0.442857 | ... | 1.0 | 1.000000 | 0.333333 | 0.350877 | 0.625 | 0.666667 | 0.555556 | 0.482759 | 0.133333 | 0.482759 |
| 2 | 0.452381 | 1.0 | 0.909807 | 0.5 | 0.035714 | 0.25 | 0.8 | 1.000000 | 1.0 | 0.885714 | ... | 0.0 | 0.333333 | 0.000000 | 0.245614 | 0.625 | 0.666667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 0.357143 | 0.5 | 0.923407 | 0.5 | 0.071429 | 0.75 | 0.2 | 1.000000 | 0.0 | 0.371429 | ... | 0.0 | 0.666667 | 0.000000 | 0.280702 | 0.625 | 0.666667 | 0.444444 | 0.482759 | 0.400000 | 0.000000 |
| 4 | 0.214286 | 1.0 | 0.350036 | 0.5 | 0.035714 | 0.00 | 0.6 | 0.000000 | 1.0 | 0.142857 | ... | 0.0 | 1.000000 | 0.333333 | 0.210526 | 0.625 | 0.666667 | 0.111111 | 0.137931 | 0.266667 | 0.137931 |
5 rows × 30 columns
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_Scaled,Y,test_size=0.2,random_state=0)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
(1176, 30) (294, 30) (1176,) (294,)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train,y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
pred1 = lr.predict(x_test)
pred1
array(['No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No',
'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No',
'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No'],
dtype=object)
y_test
442 No
1091 No
981 Yes
785 No
1332 Yes
...
1439 No
481 No
124 Yes
198 No
1229 No
Name: Attrition, Length: 294, dtype: object
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score,roc_curve
accuracy_score(y_test,pred1)
0.8775510204081632
pd.crosstab(y_test,pred1)
| col_0 | No | Yes |
|---|---|---|
| Attrition | ||
| No | 241 | 4 |
| Yes | 32 | 17 |
print(classification_report(y_test,pred1))
precision recall f1-score support
No 0.88 0.98 0.93 245
Yes 0.81 0.35 0.49 49
accuracy 0.88 294
macro avg 0.85 0.67 0.71 294
weighted avg 0.87 0.88 0.86 294
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
pred2 = dtc.predict(x_test)
pred2
array(['No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes',
'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No',
'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No',
'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes',
'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No',
'No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'No',
'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No',
'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes',
'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes',
'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No'], dtype=object)
y_test
442 No
1091 No
981 Yes
785 No
1332 Yes
...
1439 No
481 No
124 Yes
198 No
1229 No
Name: Attrition, Length: 294, dtype: object
accuracy_score(y_test,pred2)
0.7891156462585034
pd.crosstab(y_test,pred2)
| col_0 | No | Yes |
|---|---|---|
| Attrition | ||
| No | 210 | 35 |
| Yes | 27 | 22 |
print(classification_report(y_test,pred2))
precision recall f1-score support
No 0.89 0.86 0.87 245
Yes 0.39 0.45 0.42 49
accuracy 0.79 294
macro avg 0.64 0.65 0.64 294
weighted avg 0.80 0.79 0.80 294
from sklearn.model_selection import GridSearchCV
parameter = {'criterion':['gini','entropy'],'splitter':['best','random'],'max_depth':[1,2,3,4,5],'max_features':['auto','sqrt','log2']}
grid_search = GridSearchCV(estimator=dtc,param_grid=parameter,cv=5,scoring='accuracy')
grid_search.fit(x_train,y_train)
C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning:
100 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
estimator._validate_params()
File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
validate_parameter_constraints(
File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of DecisionTreeClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py:976: UserWarning: One or more of the test scores are non-finite: [ nan nan 0.84013704 0.84183916 0.83588172 0.84013704
nan nan 0.83673278 0.84013704 0.83843491 0.83928597
nan nan 0.84693112 0.84269023 0.84692752 0.83929318
nan nan 0.83501262 0.83843491 0.83419401 0.83504508
nan nan 0.82228994 0.82908763 0.84183195 0.84098449
nan nan 0.84013704 0.84013704 0.84013704 0.84013704
nan nan 0.8316264 0.8409881 0.8409881 0.84013704
nan nan 0.82993869 0.84439235 0.84862964 0.83758024
nan nan 0.84097367 0.84182834 0.84607645 0.84013704
nan nan 0.84353408 0.83673999 0.84609088 0.84012982]
warnings.warn(
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 2, 3, 4, 5],
'max_features': ['auto', 'sqrt', 'log2'],
'splitter': ['best', 'random']},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 2, 3, 4, 5],
'max_features': ['auto', 'sqrt', 'log2'],
'splitter': ['best', 'random']},
scoring='accuracy')DecisionTreeClassifier()
DecisionTreeClassifier()
grid_search.best_params_
{'criterion': 'entropy',
'max_depth': 3,
'max_features': 'log2',
'splitter': 'best'}
dtc_cv = DecisionTreeClassifier(criterion='entropy',max_depth=3,max_features='log2',splitter='best')
dtc_cv.fit(x_train,y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features='log2')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features='log2')
pred2 = dtc_cv.predict(x_test)
accuracy_score(y_test,pred2)
0.8435374149659864
pd.crosstab(y_test,pred2)
| col_0 | No | Yes |
|---|---|---|
| Attrition | ||
| No | 242 | 3 |
| Yes | 43 | 6 |
print(classification_report(y_test,pred2))
precision recall f1-score support
No 0.85 0.99 0.91 245
Yes 0.67 0.12 0.21 49
accuracy 0.84 294
macro avg 0.76 0.56 0.56 294
weighted avg 0.82 0.84 0.80 294
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
forest_params = [{'max_depth':list(range(10,15)),'max_features':list(range(0,14))}]
rfc_cv = GridSearchCV(rfc,param_grid=forest_params,cv=10,scoring='accuracy')
rfc_cv.fit(x_train,y_train)
C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning:
50 fits failed out of a total of 700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
estimator._validate_params()
File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
validate_parameter_constraints(
File "C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 0 instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Arun\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py:976: UserWarning: One or more of the test scores are non-finite: [ nan 0.85035492 0.85714182 0.86055338 0.86309576 0.86478343
0.85883674 0.86052441 0.86477618 0.85966247 0.86307403 0.85967695
0.86050992 0.86306678 nan 0.84779082 0.85714907 0.85800377
0.86222657 0.86054614 0.85966971 0.85882225 0.86221208 0.86051717
0.85966971 0.85881501 0.85371578 0.85966971 nan 0.85119513
0.86054614 0.85884398 0.85543242 0.85967695 0.85967695 0.85627264
0.86053165 0.85882949 0.86136462 0.86307403 0.86221208 0.85626539
nan 0.85034767 0.85629436 0.86140084 0.85543242 0.85885122
0.85626539 0.86137911 0.86392873 0.86305954 0.85966971 0.86136462
0.85882949 0.8621976 nan 0.85120962 0.85969868 0.86137911
0.86394321 0.8605389 0.85966971 0.85714182 0.85627264 0.86051717
0.86137911 0.85796755 0.85371578 0.85882949]
warnings.warn(
GridSearchCV(cv=10, estimator=RandomForestClassifier(),
param_grid=[{'max_depth': [10, 11, 12, 13, 14],
'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13]}],
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=RandomForestClassifier(),
param_grid=[{'max_depth': [10, 11, 12, 13, 14],
'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13]}],
scoring='accuracy')RandomForestClassifier()
RandomForestClassifier()
pred3 = rfc_cv.predict(x_test)
accuracy_score(y_test,pred3)
0.8503401360544217
pd.crosstab(y_test,pred3)
| col_0 | No | Yes |
|---|---|---|
| Attrition | ||
| No | 242 | 3 |
| Yes | 41 | 8 |
print(classification_report(y_test,pred3))
precision recall f1-score support
No 0.86 0.99 0.92 245
Yes 0.73 0.16 0.27 49
accuracy 0.85 294
macro avg 0.79 0.58 0.59 294
weighted avg 0.83 0.85 0.81 294